# Read the raw sales export from the project's dataset directory and list the
# columns that are available for analysis.
videogames.df <- read.csv(
  file = file.path(project.dir, dataset.dir, "vgsales-12-4-2019.csv")
)
names(videogames.df)
## [1] "Rank" "Name" "basename" "Genre"
## [5] "ESRB_Rating" "Platform" "Publisher" "Developer"
## [9] "VGChartz_Score" "Critic_Score" "User_Score" "Total_Shipped"
## [13] "Global_Sales" "NA_Sales" "PAL_Sales" "JP_Sales"
## [17] "Other_Sales" "Year" "Last_Update" "url"
## [21] "status" "Vgchartzscore" "img_url"
# The data was collected in April 2019, so 2019 is only partially covered and
# would under-report that year's sales. Keep only games released before 2019.
# Note: filter() also drops rows whose Year is NA.
videogames.clean <- videogames.df %>%
  filter(Year < 2019)
# "E" was originally called "KA" in the ESRB system, so fold every "KA" rating
# into "E".
videogames.clean <- videogames.clean %>%
  mutate(ESRB_Rating = replace(ESRB_Rating, ESRB_Rating == "KA", "E"))
# Give the ESRB ratings explicit factor levels for easier graphing and data
# manipulation.
unique(videogames.clean$ESRB_Rating)
## [1] "E" "" "M" "E10" "T" "RP" "EC" "AO"
# Levels run from unrated / rating pending up through the age bands. "EC" is
# placed before "E" because it targets a younger audience, so plots that follow
# the level order are sorted by age band.
videogames.clean$ESRB_Rating <- factor(
  videogames.clean$ESRB_Rating,
  levels = c("", "RP", "EC", "E", "E10", "T", "M", "AO")
)
We want to compare sales across different regions, so it would be convenient to have one column ‘region’ and then a corresponding column for sales in USD (millions).
# Reshape the sales columns into long form: one row per game and sales column,
# with the column name in `Region` and its value in `Sales`.
# The Global_Sales:Other_Sales range covers Global_Sales, NA_Sales, PAL_Sales,
# JP_Sales and Other_Sales, so "Global_Sales" appears as a Region value too.
# Rows with a missing sales value are dropped.
vs_byregion <- videogames.clean %>%
  gather(key = Region, value = Sales, Global_Sales:Other_Sales, na.rm = TRUE)
Conduct some descriptive analysis on the data, figuring out: * distributions of variables, * variables that appear to be strongly related with each other (using appropriate methods to quantify the relationships based on whether variables are numerical or categorical).
From the boxplot we can see that we have 2 extreme outliers. After investigating, it looks like the two outliers are GTA V (PS3 and PS4).
# Box plot of global sales, used to spot extreme outliers.
boxplot(
  videogames.clean$Global_Sales,
  xlab = "Global Sales (millions of USD)"
)
# Show the rows behind the extreme points (global sales above 17).
subset(videogames.clean, Global_Sales > 17)
# Histogram of global sales, zoomed to the 0-0.5 range with fine-grained bins.
hist(
  videogames.clean$Global_Sales,
  breaks = 2000,
  xlim = c(0, 0.5),
  xlab = "Global Sales (millions of USD)"
)
# Interactive bar chart of the number of games per platform, ordered from the
# most to the least common platform.
ggplotly(
  ggplot(
    count(videogames.clean, Platform, sort = TRUE),
    aes(x = reorder(Platform, -n), y = n)
  ) +
    geom_bar(stat = "identity", position = position_dodge(width = 0)) +
    # Rotate the platform labels so they do not overlap.
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
)
# Bar chart of the number of games per ESRB rating.
ggplot(data = videogames.clean, mapping = aes(x = ESRB_Rating)) +
  geom_bar()
# Bar chart of the number of games per genre, most common genre first.
ggplot(
  count(videogames.clean, Genre, sort = TRUE),
  aes(x = reorder(Genre, -n), y = n)
) +
  geom_bar(stat = "identity") +
  # Tilt the genre labels to keep them readable.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Here we looked at distribution of User Scores and Critic Scores as well as the average Critic and User Score over time.
# Overlay the distributions of critic scores and user scores.
# `fill` is mapped to a series label inside aes(), and the actual colours are
# assigned in scale_fill_manual(). Writing aes(fill = 'pink') only creates a
# legend entry called "pink" drawn in ggplot2's default palette.
# alpha keeps the first histogram visible underneath the second one.
videogames.clean %>%
  ggplot() +
  geom_histogram(
    aes(x = Critic_Score, fill = "Critic_Score"),
    binwidth = 0.5,
    alpha = 0.6
  ) +
  geom_histogram(
    aes(x = User_Score, fill = "User_Score"),
    binwidth = 0.5,
    alpha = 0.6
  ) +
  scale_fill_manual(
    name = "ScoreType",
    values = c(Critic_Score = "pink", User_Score = "blue")
  ) +
  # Both layers share the axis, so label it generically rather than with the
  # first layer's column name.
  xlab("Score")
We have a ton of publishers
# Bar chart of the number of games per release year.
ggplot(data = videogames.clean, mapping = aes(x = Year)) +
  geom_bar()
# Total sales for every (Year, Region) pair.
vs_sales.byregion.byyear <- summarize(
  group_by(vs_byregion, Year, Region),
  Sales = sum(Sales)
)
# One line per region showing how its total sales change over the years.
ggplot(vs_sales.byregion.byyear, aes(x = Year, y = Sales)) +
  geom_line(aes(color = Region))
# Average user, critic and VGChartz score per release year, from 1989 onwards,
# drawn as one line per score type.
videogames.clean %>%
  group_by(Year) %>%
  summarise(
    User_Score = mean(User_Score, na.rm = TRUE),
    Critic_Score = mean(Critic_Score, na.rm = TRUE),
    Vgchartzscore = mean(Vgchartzscore, na.rm = TRUE)
  ) %>%
  filter(Year >= 1989) %>%
  # Blank out user scores before 1996. The original stored this mask in an
  # unused `User_Score2` column, so it never reached the plot; it is applied to
  # `User_Score` directly here.
  # NOTE(review): presumably pre-1996 user scores are too sparse to be
  # meaningful - confirm the cutoff.
  mutate(User_Score = if_else(Year >= 1996, User_Score, NA_real_)) %>%
  # Long form: one row per (Year, ScoreType); missing averages are dropped.
  gather(
    key = ScoreType,
    value = Score,
    c(User_Score, Critic_Score, Vgchartzscore),
    na.rm = TRUE
  ) %>%
  ggplot(aes(x = Year)) + # TODO : Make look better
  geom_line(aes(y = Score, color = ScoreType)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  # A single x scale carries the title, yearly breaks and limits. Adding xlim()
  # afterwards would replace this scale and discard the breaks.
  scale_x_continuous("Year", breaks = 1989:2018, limits = c(1989, 2018))
⁃ construct CI for population mean value for sales
⁃
⁃ assumptions:
⁃ sample is randomized: we will pick a random sample from our data
⁃ population distribution is approximately normal:
⁃ NO, very right skewed (most games sell very little, with a long tail of high sellers), but we will use large sample sizes, so by the Central Limit Theorem that will not be too much of a problem
⁃ Extreme outliers:
⁃ we will remove extreme outliers (GTA V)
Sales in Sports vs. Shooters: H_0: mean sales (Sports) - mean sales (Shooter) = 0; H_a: mean sales (Sports) - mean sales (Shooter) ≠ 0